/hps/research1/birney/users/ian/hmn_fstconda env on cluster# Create env on cluster with mamba
mamba create -y \
-n fst_env_rhel \
-c bioconda gatk4
conda activate fst_env_rhel
mamba install bcftools plink2 r-base r-essentials r-tidyverse r-units libgdal r-sf
# Export
conda env export \
--no-builds \
-f envs/fst_env_rhel.yml
# Activate
conda activate fst_env_rhelrenv# Export env (to renv.lock file)
renv::init()
# To install packages on new system, or 'activate' the env:
renv::restore()Load libaries
java -jar /nfs/software/birney/picard-2.9.0/picard.jar MergeVcfs \
I=human_traits_fst/data/20200205_vcfs.list \
O=vcfs/1gk_all.vcf.gz
# Exception in thread "main" java.lang.IllegalArgumentException: The contig entries in input file /hps/research1/birney/users/ian/rac_hyp/vcfs/ftp.1000genomes.ebi.ac.uk/ALL.chrMT.phase3_callmom-v0_4.20130502.genotypes.vcf.gz are not compatible with the others.
# So remove that one from list above
sed -i '/MT/d' human_traits_fst/data/20200205_vcfs.list
# run MergeVCFs again
java -jar /nfs/software/birney/picard-2.9.0/picard.jar MergeVcfs \
I=human_traits_fst/data/20200205_vcfs.list \
O=vcfs/1gk_all.vcf.gz
# Exception in thread "main" java.lang.IllegalArgumentException: The contig entries in input file /hps/research1/birney/users/ian/rac_hyp/vcfs/ftp.1000genomes.ebi.ac.uk/ALL.chrY.phase3_integrated_v2a.20130502.genotypes.vcf.gz are not compatible with the others.
sed -i '/chrY/d' human_traits_fst/data/20200205_vcfs.list
# run MergeVCFs again
java -jar /nfs/software/birney/picard-2.9.0/picard.jar MergeVcfs \
I=human_traits_fst/data/20200205_vcfs.list \
O=vcfs/1gk_all.vcf.gz
# SUCCESSFrom Lee et al. (2019) Gene discovery and polygenic prediction from a genome-wide association study of educational attainment in 1.1 million individuals, Nature: https://www.nature.com/articles/s41588-018-0147-3.
Data download link: https://static-content.springer.com/esm/art%3A10.1038%2Fs41588-018-0147-3/MediaObjects/41588_2018_147_MOESM3_ESM.xlsx
Saved here: data/20180723_Lee-et-al_supp-tables.xlsx
Notes: - European-ancestry cohort only. - n = 1,131,881 individuals. - SNP panel: ~10M from 1GKph3. - Phenotype: number of years of schooling that individuals completed (EduYears). - Identified 1271 independent SNPs at the genome-wide significance level of 5e-8 (or 995 at P < 1e-8). - COJO: reduced to 765 at P < 5e-8: https://doi.org/10.1038/ng.2213 - Corrected by LD score regression: only ~5% of inflation attributable to bias. LD score intercept of 1.11.
From Yengo et al. (2018) Meta-analysis of genome-wide association studies for height and body mass index in approximately 700000 individuals of European ancestry, Human Molecular Genetics: https://academic.oup.com/hmg/article-abstract/27/20/3641/5067845.
Notes: - European-ancestry cohort only. - Meta-analysis combining GIANT and UKBioBank cohorts from Wood et al. (2014) https://doi.org/10.1038/ng.3097 (height) and Locke et al. (2015) https://doi.org/10.1038/nature14177 (BMI). - n = Up to 795,624 individuals (456426 from UKBioBank and the rest (339198?) from GIANT). - SNP panel: ~2.4M HM2 SNPs. - Identified 3290 (2388 main, 902 secondary; 712 loci) and 941 (656 main, 285 secondary; 536 loci) near-independent SNPs associated with height and BMI respectively, at a genome-wide significance threshold of P < 1e−8). - Analysed with COJO: https://doi.org/10.1038/ng.2213 - Corrected by LD score regression: LD score intercept of 1.48 and 1.03 for height and BMI respectively.
Data downloaded from this webpage: https://portals.broadinstitute.org/collaboration/giant/index.php/GIANT_consortium_data_files Data download link: https://portals.broadinstitute.org/collaboration/giant/images/4/4b/Meta-analysis_Wood_et_al%2BUKBiobank_2018_top_3290_from_COJO_analysis.txt.gz
cd human_traits_fst/data
# download
wget https://portals.broadinstitute.org/collaboration/giant/images/4/4b/Meta-analysis_Wood_et_al%2BUKBiobank_2018_top_3290_from_COJO_analysis.txt.gz
# unzip
gunzip Meta-analysis_Wood_et_al%2BUKBiobank_2018_top_3290_from_COJO_analysis.txt.gz
# rename
mv Meta-analysis_Wood_et_al+UKBiobank_2018_top_3290_from_COJO_analysis.txt 20181015_Yengo-et-al_snps_height.txtSaved here: data/20181015_Yengo-et-al_snps_height.txt
From Yengo et al. (2018) Meta-analysis of genome-wide association studies for height and body mass index in approximately 700000 individuals of European ancestry, Human Molecular Genetics: https://academic.oup.com/hmg/article-abstract/27/20/3641/5067845. (Same study as above.)
Data download link: https://portals.broadinstitute.org/collaboration/giant/images/e/e2/Meta-analysis_Locke_et_al%2BUKBiobank_2018_top_941_from_COJO_analysis_UPDATED.txt.gz
# download
wget -P data https://portals.broadinstitute.org/collaboration/giant/images/e/e2/Meta-analysis_Locke_et_al%2BUKBiobank_2018_top_941_from_COJO_analysis_UPDATED.txt.gz
# unzip
gunzip data/Meta-analysis_Locke_et_al+UKBiobank_2018_top_941_from_COJO_analysis_UPDATED.txt.gz
# rename
mv data/Meta-analysis_Locke_et_al+UKBiobank_2018_top_941_from_COJO_analysis_UPDATED.txt data/20181015_Yengo-et-al_snps_bmi.txtFrom Huang et al. (2017) Fine-mapping inflammatory bowel disease loci to single-variant resolution, Nature: https://www.nature.com/articles/nature22969#Sec29.
Notes: - European-ancestry cohort - n = 67,852 individuals - Identified 94 loci - Take from list of credible sets those with “1” in signal column.
Data download link (Supplementary Table 1): https://static-content.springer.com/esm/art%3A10.1038%2Fnature22969/MediaObjects/41586_2017_BFnature22969_MOESM2_ESM.xlsx
Saved here: data/20170628_Huang-et-al_supp-table-1.xlsx
From Jostins et al. (2012) Host–microbe interactions have shaped the genetic architecture of inflammatory bowel disease, Nature: https://www.nature.com/articles/nature11582
Notes: - European-ancestry cohort - n = 41,660 individuals - SNP panel: 1.23M imputed SNPs from HM3 - Identified 193 independent SNPs associated with at least one of Crohn’s, UC, or IBD, at genome-wide significance of P < 5e-8, merged into 163 loci.
Data download link (Supplementary Table 2): https://static-content.springer.com/esm/art%3A10.1038%2Fnature11582/MediaObjects/41586_2012_BFnature11582_MOESM90_ESM.zip
Saved here: data/20121031_Jostins-et-al_supp-table-2.xlsx
[NOTE: All other traits have SNP-hits based on only European-ancestry populations. Here, the SNP hits come from various populations. Potential bias? Or if we only had pigmentation SNP-hits from European populations, would we expect the Fst to be even more skewed?]{colour = “red”}
data/20171117_Crawford-et-al_Table-1.xlsxdata/20190121_Adhikari-et-al_snps.xlsxdata/20170316_Hernandez-Pacheco-et-al.xlsx.Others compiled into the single XLSX doc data/20200622_pigmentation_snps.xlsx:
pig_snps <- list()
# Crawford
pig_snps[["crawford"]] <- readxl::read_excel(here("data", "20171117_Crawford-et-al_Table-1.xlsx")) %>%
dplyr::select(rsid = "RSID", everything()) %>%
dplyr::filter(!is.na(rsid),
!rsid == ".")
# Adhikari
pig_snps[["adhikari_tbl_1"]] <- readxl::read_excel(here("data", "20190121_Adhikari-et-al_snps.xlsx"),
sheet = "Table 1",
skip = 1) %>%
dplyr::select(rsid = "rsID", everything()) %>%
dplyr::filter(!is.na(rsid))
pig_snps[["adhikari_supp_6"]] <- readxl::read_excel(here("data", "20190121_Adhikari-et-al_snps.xlsx"),
sheet = "supp_table_6") %>%
dplyr::select(rsid = "SNP", everything()) %>%
dplyr::filter(!is.na(rsid))
pig_snps[["adhikari_supp_12"]] <- readxl::read_excel(here("data", "20190121_Adhikari-et-al_snps.xlsx"),
sheet = "supp_table_12") %>%
dplyr::select(rsid = "SNP", everything()) %>%
dplyr::filter(!is.na(rsid))
# Hernandez-Pacheco
pig_snps[["hernandez-pacheco"]] <- readxl::read_excel(here("data", "20170316_Hernandez-Pacheco-et-al.xlsx"))
# Doc with SNPs from multiple studies
sheet_names <- readxl::excel_sheets(here("data", "20200622_pigmentation_snps.xlsx"))
compiled_snps <- lapply(sheet_names, function(x){
x <- readxl::read_excel(here("data", "20200622_pigmentation_snps.xlsx"),
sheet = x)
})
names(compiled_snps) <- sheet_names
# Combine
pig_snps <- c(pig_snps, compiled_snps)## extract from excel doc
snps_eduyrs <- readxl::read_xlsx(here::here("data", "20180723_Lee-et-al_supp-tables.xlsx"), sheet = "2. EduYears Lead SNPs", skip = 1, n_max = 1271)
## write table of SNPs
write.table(snps_eduyrs[["SNP"]], here::here("data", "snps_edu.list"), quote = F, row.names = F, col.names = F)# extract from excel doc
snps_ibd <- readxl::read_xlsx(path = here::here("data", "20170628_Huang-et-al_supp-table-1.xlsx"),
sheet = "list of credible sets") %>%
dplyr::filter(signal == 1) %>% # take first-ranked SNP in locus
dplyr::filter(!grepl("chr", variant.lead)) %>% # remove SNPs without rsID
dplyr::select(variant.lead) # take just rsID
# write tables of SNPs
write.table(x = snps_ibd$variant.lead,
file = here::here("data", "snps_ibd.list"),
quote = F,
row.names = F,
col.names = F)Plink2Downloded via this page: http://www.internationalgenome.org/data Download link: http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/working/20130606_sample_info/20130606_sample_info.xlsx.
Saved here: data/20130606_sample_info.xlsx
meta = readxl::read_xlsx(here::here("data", "20130606_sample_info.xlsx"),
sheet = "Sample Info") %>%
dplyr::select(Sample, Population, Gender)
knitr::kable(head(meta))| Sample | Population | Gender |
|---|---|---|
| HG00096 | GBR | male |
| HG00097 | GBR | female |
| HG00098 | GBR | male |
| HG00099 | GBR | female |
| HG00100 | GBR | female |
| HG00101 | GBR | male |
Plink2traits=$(echo edu hei bmi ibd pig)
# Get AF per population
for trait in $(echo $traits); do
# make directory
mkdir -p human_traits_fst/data/20200622_plink2_alfreqs/$trait ;
# create allele-freq tables
plink2 \
--vcf vcfs/snphits_$trait.vcf.gz \
--freq \
--max-alleles 2 \
--pheno iid-only human_traits_fst/data/plink2_sample_popn_key.txt \
--loop-cats PHENO1 \
--out human_traits_fst/data/20200622_plink2_alfreqs/$trait/$trait ;
done
# Get global AF
for trait in $(echo $traits); do
# make directory
mkdir -p human_traits_fst/data/20200622_plink2_alfreqs/$trait ;
# create allele-freq tables
bsub \
-o log/alfreq_global_$trait.out \
-e log/alfreq_global_$trait.err \
"""
conda activate fst_env_rhel ;
plink2 \
--vcf vcfs/snphits_$trait.vcf.gz \
--freq \
--max-alleles 2 \
--out human_traits_fst/data/20200622_plink2_alfreqs/$trait/$trait.all
""";
done# Create factor levels for `trait`
trait_levels = c("edu", "hei", "bmi", "ibd", "pig")
# Create vector for recoding traits with full names
recode_vec = c("edu" = "Educational attainment",
"hei" = "Height",
"bmi" = "BMI",
"ibd" = "IBD",
"pig" = "Pigmentation")
# Colour palette
fill_pal = c("Educational attainment" = "#00afbb",
"Height" = "#FFBF00",
"BMI" = "#0bc166",
"IBD" = "#360568",
"Pigmentation" = "#fc4e07")# Get list of files
target_files = list.files(here::here("data", "20200622_plink2_alfreqs"),
pattern = "all.afreq",
recursive = T,
full.names = T)
# Read in data
alfreq_list_glob = lapply(target_files, function(file){
trait_df = read.table(file,
header = T,
comment.char = "")
})
# Set names
names(alfreq_list_glob) = gsub(pattern = ".all.afreq",
replacement = "",
basename(target_files))
# Merge into single Df
alfreq_glob_df = dplyr::bind_rows(alfreq_list_glob, .id = "trait")
# Get major and minor alleles and frequency
alfreq_glob_df = alfreq_glob_df %>%
dplyr::mutate(MAJ = if_else(ALT_FREQS <= 0.5,
REF,
ALT),
MIN = ifelse(ALT_FREQS <= 0.5,
ALT,
REF),
MAF = ifelse(ALT_FREQS <= 0.5,
ALT_FREQS,
1 - ALT_FREQS))
# Make `trait` a factor
alfreq_glob_df$trait = factor(alfreq_glob_df$trait, levels = trait_levels)
# Recode traits for plotting
alfreq_glob_df$trait = dplyr::recode(alfreq_glob_df$trait, !!!recode_vec)Plot
alfreq_glob_df %>%
ggplot(aes(MAF, fill = trait)) +
geom_histogram() +
scale_fill_manual(values = fill_pal) +
facet_wrap(~trait, nrow = 2) +
guides(fill = F) +
theme_bw() +
ggtitle("Global MAF distributions")## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Save
ggsave(here("plots", "20210114_global_maf_distributions.svg"),
device = "svg",
units = "cm",
dpi = 400,
height = 12,
width = 19.5)## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
target_dirs <- list.dirs(here::here("data", "20200622_plink2_alfreqs"), recursive = F)
al_freq_lst <- lapply(target_dirs, function(x){
target_files <- list.files(x, pattern = ".afreq", full.names = T)
# read in data
data_lst <- lapply(target_files, function(target_file){
read.table(target_file,
header = T,
comment.char = "")
})
# fix names of populations
names(data_lst) <- gsub(pattern = "edu.|hei.|bmi.|ibd.|pig.|.afreq",
replacement = "",
x = list.files(x, pattern = ".afreq"))
return(data_lst)
})
# set names
names(al_freq_lst) <- basename(target_dirs)set.seed(65)
rdm_sds <- sample(1:100, 5)
counter <- 0
al_freq_df_shuff <- lapply(al_freq_df, function(pheno_df){
counter <<- counter + 1
# set seed
set.seed(rdm_sds[counter])
# select SNPs to swap (half of total)
tgt_indcs <- sample(nrow(pheno_df), nrow(pheno_df) /2)
# swap minor alleles
pheno_df[tgt_indcs, 5:ncol(pheno_df)] <- 1 - pheno_df[tgt_indcs, 5:ncol(pheno_df)]
# return pheno_df
return(pheno_df)
})Pull out random SNPs with the same allele frequencies as the GWAS SNP-hits
With Plink2
# Per chromosome
for chr in $(seq 1 24) ; do
# make directory
mkdir -p big_data/20210114_alfreqs_all/by_chr/$chr;
# create allele-freq tables
bsub \
-M 10000 \
-o log/20210114_plink_alfreq_$chr.out \
-e log/20210114_plink_alfreq_$chr.err \
"plink2 \
--vcf vcfs/1gk_all.vcf.gz \
--freq \
--chr $chr \
--max-alleles 2 \
--pheno iid-only human_traits_fst/data/plink2_sample_popn_key.txt \
--loop-cats PHENO1 \
--out big_data/20210114_alfreqs_all/by_chr/$chr/$chr ";
done target_dirs <- list.dirs(here::here("data", "20200622_plink2_alfreqs"), recursive = F)
al_freq_lst <- lapply(target_dirs, function(x){
target_files <- list.files(x, pattern = ".afreq", full.names = T)
# read in data
data_lst <- lapply(target_files, function(target_file){
read.table(target_file,
header = T,
comment.char = "")
})
# fix names of populations
names(data_lst) <- gsub(pattern = "edu.|hei.|bmi.|ibd.|pig.|.afreq",
replacement = "",
x = list.files(x, pattern = ".afreq"))
return(data_lst)
})
# set names
names(al_freq_lst) <- basename(target_dirs)
# reorder for (1) height, (2) eduyears, (3) ibd, (4) pigmentation
al_freq_lst <- al_freq_lst[c(2, 1, 3, 4)]set.seed(65)
rdm_sds <- sample(1:100, 4)
counter <- 0
al_freq_df_shuff <- lapply(al_freq_df, function(pheno){
counter <<- counter + 1
# set seed
set.seed(rdm_sds[counter])
# select SNPs to swap (half of total)
tgt_indcs <- sample(nrow(pheno), nrow(pheno) /2)
# swap minor alleles
pheno[tgt_indcs, 5:ncol(pheno)] <- 1 - pheno[tgt_indcs, 5:ncol(pheno)]
# return pheno
return(pheno)
})# Set up titles vector
titles <- c("Height", "Educational Attainment", "Inflammatory Bowel Disease", "Pigmentation")counter <- 0
lapply(al_freq_df_shuff, function(pheno){
counter <<- counter + 1
ggplot(pheno,
aes(YRI, CHS)) +
geom_point(size = 0.5) +
coord_fixed() +
geom_smooth(se = F, colour = "red") +
geom_abline(intercept = 0, slope = 1, colour = "blue") +
xlab("Allele frequency in YRI") +
ylab("Allele frequency in CHS") +
labs(title = titles[counter])
})## $edu
##
## $bmi
##
## $hei
##
## $ibd
counter <- 0
lapply(al_freq_df_shuff, function(pheno){
counter <<- counter + 1
ggplot(pheno,
aes(YRI, CEU)) +
geom_point(size = 0.5) +
coord_fixed() +
geom_smooth(se = F, colour = "red") +
geom_abline(intercept = 0, slope = 1, colour = "blue") +
xlab("Allele frequency in YRI") +
ylab("Allele frequency in CEU") +
labs(title = titles[counter])
})## $edu
##
## $bmi
##
## $hei
##
## $ibd
colourscales <- c("Viridis", "Hot", "Bluered", "Electric")
titles <- c("Height", "Educational Attainment", "IBD", "Skin/hair pigmentation")
counter <- 0
plts <- lapply(al_freq_df_shuff, function(pheno){
counter <<- counter + 1
# set graph resolution
graph_reso <- 0.05
# get lm for data
loess_model <- loess(CEU ~ 0 + CHS + YRI, data = pheno)
# set up axes
axis_x <- seq(min(pheno$CHS), max(pheno$CHS), by = graph_reso)
axis_y <- seq(min(pheno$YRI), max(pheno$YRI), by = graph_reso)
# sample points
lm_surface <- expand.grid(CHS = axis_x,
YRI = axis_y,
KEEP.OUT.ATTRS = F)
lm_surface$CEU <- predict(loess_model, newdata = lm_surface)
lm_surface <- reshape2::acast(lm_surface, YRI ~ CHS, value.var = "CEU")
# create plot
plt <- plot_ly(pheno,
x = ~CHS,
y = ~YRI,
z = ~CEU,
type = "scatter3d",
mode = "markers",
marker = list(size = 2),
text = pheno$ID)
plt <- add_trace(plt,
z = lm_surface,
x = axis_x,
y = axis_y,
type = "surface",
colorscale = colourscales[counter]) %>%
layout(title = titles[counter])
return(plt)
})
plts$hei## NULL
# Create raw list of variants
vcf_list_raw <- lapply(target_vcfs, function(vcf_file){
vcf_out <- pegas::read.vcf(vcf_file)
})
# Create vector of populations
populations <- unlist(lapply(rownames(vcf_list_raw[[1]]), function(sample){
meta$Population[meta$Sample == sample]
}))
# Generate Fst stats
fst_out_lst <- lapply(vcf_list_raw, function(pheno){
as.data.frame(pegas::Fst(pheno, pop = populations))
})
# make rownames into separate column
fst_out_lst <- lapply(fst_out_lst, function(pheno){
pheno$snp <- rownames(pheno)
return(pheno)
})
names(fst_out_lst) <- titles
# bind into single DF
fst_out_df <- dplyr::bind_rows(fst_out_lst, .id = "phenotype")
# Set order of phenotypes
fst_out_df$phenotype <- factor(fst_out_df$phenotype, levels = c("Height", "Educational Attainment", "IBD", "Skin/hair pigmentation"))
head(fst_out_df)ggplot(fst_out_df, aes(Fst, fill = phenotype)) +
geom_density(alpha = 0.7) +
labs(fill = "Phenotype") +
ylab("Density") +
theme_bw() +
scale_fill_manual(values = c("#E7B800", "#00AFBB", "#360568", "#FC4E07"))# factorise
fst_out_df$phenotype_3d <- factor(fst_out_df$phenotype,
levels = c("Skin/hair pigmentation", "IBD", "Educational Attainment", "Height"))
ggplot() +
geom_density_ridges2(data = fst_out_df,
mapping = aes(x = Fst, y = phenotype_3d, fill = phenotype_3d),
scale = 2) +
scale_fill_manual(values = c("#FC4E07", "#360568", "#00AFBB", "#E7B800")) +
ylab(label = NULL) +
theme_bw() +
guides(fill = guide_legend(reverse=T,
title = "Phenotype")) +
scale_y_discrete(expand = expand_scale(add = c(0.2, 2.3)))# Height v EA
ks.test(fst_out_df$Fst[fst_out_df$phenotype == "Height"],
fst_out_df$Fst[fst_out_df$phenotype == "Educational Attainment"])##
## Two-sample Kolmogorov-Smirnov test
##
## data: fst_out_df$Fst[fst_out_df$phenotype == "Height"] and fst_out_df$Fst[fst_out_df$phenotype == "Educational Attainment"]
## D = 0.039184, p-value = 0.1203
## alternative hypothesis: two-sided
# Height v IBD
ks.test(fst_out_df$Fst[fst_out_df$phenotype == "Height"],
fst_out_df$Fst[fst_out_df$phenotype == "IBD"])##
## Two-sample Kolmogorov-Smirnov test
##
## data: fst_out_df$Fst[fst_out_df$phenotype == "Height"] and fst_out_df$Fst[fst_out_df$phenotype == "IBD"]
## D = 0.086335, p-value = 1.111e-06
## alternative hypothesis: two-sided
# Height v Pigmentation
ks.test(fst_out_df$Fst[fst_out_df$phenotype == "Height"],
fst_out_df$Fst[fst_out_df$phenotype == "Skin/hair pigmentation"])##
## Two-sample Kolmogorov-Smirnov test
##
## data: fst_out_df$Fst[fst_out_df$phenotype == "Height"] and fst_out_df$Fst[fst_out_df$phenotype == "Skin/hair pigmentation"]
## D = 0.30629, p-value < 2.2e-16
## alternative hypothesis: two-sided
# get samples from target popns only
target_popns <- which(populations %in% c("YRI", "CEU", "CHS"))
populations_3pop <- populations[target_popns]
vcf_list_raw_3pop <- lapply(vcf_list_raw, function(pheno){
pheno[target_popns, ]
})
# Generate Fst stats
fst_out_lst_3pop <- lapply(vcf_list_raw_3pop, function(pheno){
as.data.frame(pegas::Fst(pheno, pop = populations_3pop))
})
# make rownames into separate column
fst_out_lst_3pop <- lapply(fst_out_lst_3pop, function(pheno){
pheno$snp <- rownames(pheno)
return(pheno)
})
names(fst_out_lst_3pop) <- titles
# bind into single DF
fst_out_df_3pop <- dplyr::bind_rows(fst_out_lst_3pop, .id = "phenotype")
head(fst_out_df_3pop)## phenotype Fit Fst Fis snp
## rs780569 Height 1 0.10356157 1 rs780569
## rs34394051 Height 1 0.02428879 1 rs34394051
## rs11121177 Height 1 0.14444609 1 rs11121177
## rs4846010 Height 1 0.09716410 1 rs4846010
## rs78116078 Height 1 0.17017201 1 rs78116078
## rs10799615 Height 1 0.17098100 1 rs10799615
ggplot(fst_out_df_3pop, aes(Fst, fill = phenotype)) +
geom_density(alpha = 0.7) +
labs(fill = "Phenotype") +
ylab("Density") +
theme_bw() +
scale_fill_manual(values = c("#E7B800", "#00AFBB", "#360568", "#FC4E07"))# factorise
fst_out_df_3pop$phenotype_3d <- factor(fst_out_df$phenotype,
levels = c("Skin/hair pigmentation", "IBD", "Educational Attainment", "Height"))
ggplot() +
geom_density_ridges2(data = fst_out_df_3pop,
mapping = aes(x = Fst, y = phenotype_3d, fill = phenotype_3d),
scale = 2) +
scale_fill_manual(values = c("#FC4E07", "#360568", "#00AFBB", "#E7B800")) +
ylab(label = NULL) +
theme_bw() +
guides(fill = guide_legend(reverse=T,
title = "Phenotype")) +
scale_y_discrete(expand = expand_scale(add = c(0.2, 2.3)))# Height v EA
ks.test(fst_out_df_3pop$Fst[fst_out_df_3pop$phenotype == "Height"],
fst_out_df_3pop$Fst[fst_out_df_3pop$phenotype == "Educational Attainment"])##
## Two-sample Kolmogorov-Smirnov test
##
## data: fst_out_df_3pop$Fst[fst_out_df_3pop$phenotype == "Height"] and fst_out_df_3pop$Fst[fst_out_df_3pop$phenotype == "Educational Attainment"]
## D = 0.025151, p-value = 0.6096
## alternative hypothesis: two-sided
# Height v IBD
ks.test(fst_out_df_3pop$Fst[fst_out_df_3pop$phenotype == "Height"],
fst_out_df_3pop$Fst[fst_out_df_3pop$phenotype == "IBD"])##
## Two-sample Kolmogorov-Smirnov test
##
## data: fst_out_df_3pop$Fst[fst_out_df_3pop$phenotype == "Height"] and fst_out_df_3pop$Fst[fst_out_df_3pop$phenotype == "IBD"]
## D = 0.065523, p-value = 0.0005063
## alternative hypothesis: two-sided
# Height v Pigmentation
ks.test(fst_out_df_3pop$Fst[fst_out_df_3pop$phenotype == "Height"],
fst_out_df_3pop$Fst[fst_out_df_3pop$phenotype == "Skin/hair pigmentation"])##
## Two-sample Kolmogorov-Smirnov test
##
## data: fst_out_df_3pop$Fst[fst_out_df_3pop$phenotype == "Height"] and fst_out_df_3pop$Fst[fst_out_df_3pop$phenotype == "Skin/hair pigmentation"]
## D = 0.30608, p-value < 2.2e-16
## alternative hypothesis: two-sided